Aim: To pre-process raw data from GEO using RMA and to perform a differential gene expression analysis comparing COPD vs CONTROL group
Chronic Obstructive Pulmonary Disease (COPD) is characterized by emphysema and chronic bronchitis, it’s diagnosed using spirometry and clinical information which lead to a heterogeneous COPD patients. The ranking of non commutable diseases from the WHO estimates that COPD is in the top of mortality causes and tobacco is the main risk factor but different genetic variants have been associated with this disease.
Different researchers have analyzed COPD transcriptomics using high-throughput data such as microarrays and RNA-seq. We belive it would be relevant to unravel a robust gene expression signature for COPD patients regardless if it is from different experiments or laboratories.
This script is part of a meta-analysis and it amis to determine a common deferentially expressed genes. The input is an ExpressionSet object with data already curated to have a phenotype column named ‘DISEASE’ and the levels are CONTROL or COPD.
The results will be logFC, adjusted p-values, etc. per experiment and they will be used to perfom a meta-analysis.
We want transcriptomic experiments from GEO that have:
Lung tissue samples
COPD vs CONTROL group
This script needs the following files:
Data 1: Table of GSE experiments
Data 2: Raw data (.CEL, .TXT)
The script can be found in: /Users/user/Documents/Doctorado/Analysis/Meta-analysis_COPD/vignettes
PATH = here::here()
DATA_DIR = file.path(PATH,"data")
OUTPUT_DIR = file.path(PATH,"output_data")
FIG_DIR = file.path(PATH,"fig")
TODAY = Sys.Date()
knitr::knit_hooks$set(timeit = local({
now = NULL
function(before, options) {
if (before) {
now <<- Sys.time()
} else {
runtime = difftime(Sys.time(), now)
now <<- NULL
# use options$label if you want the chunk label as well
paste('Time for this code chunk:', as.character(runtime))
}
}})
)
knitr::opts_knit$set(root.dir = PATH)
knitr::opts_chunk$set(echo = TRUE,
timeit=TRUE,
warning=FALSE,
attr.output='style="max-height: 500px;"')
And the analysis is run in: /Users/user/Documents/Doctorado/Analysis/Meta-analysis_COPD
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
packages <- c("knitr",
"oligo",
"tidyverse",
"limma",
"SummarizedExperiment",
"DESeq2",
"stringr",
"org.Hs.eg.db",
"AnnotationDbi",
"vsn",
"recount")
# for(l in packages){
# if (!requireNamespace(l, quietly = TRUE)) {
# BiocManager::install(l)}
#}
Time for this code chunk: 0.0214450359344482
lapply(packages, library, character.only = TRUE)
## [[1]]
## [1] "knitr" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "oligo" "Biostrings" "XVector" "IRanges" "S4Vectors"
## [6] "stats4" "Biobase" "oligoClasses" "BiocGenerics" "parallel"
## [11] "knitr" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "forcats" "stringr" "dplyr" "purrr" "readr"
## [6] "tidyr" "tibble" "ggplot2" "tidyverse" "oligo"
## [11] "Biostrings" "XVector" "IRanges" "S4Vectors" "stats4"
## [16] "Biobase" "oligoClasses" "BiocGenerics" "parallel" "knitr"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[4]]
## [1] "limma" "forcats" "stringr" "dplyr" "purrr"
## [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [11] "oligo" "Biostrings" "XVector" "IRanges" "S4Vectors"
## [16] "stats4" "Biobase" "oligoClasses" "BiocGenerics" "parallel"
## [21] "knitr" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[5]]
## [1] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [4] "GenomicRanges" "GenomeInfoDb" "limma"
## [7] "forcats" "stringr" "dplyr"
## [10] "purrr" "readr" "tidyr"
## [13] "tibble" "ggplot2" "tidyverse"
## [16] "oligo" "Biostrings" "XVector"
## [19] "IRanges" "S4Vectors" "stats4"
## [22] "Biobase" "oligoClasses" "BiocGenerics"
## [25] "parallel" "knitr" "stats"
## [28] "graphics" "grDevices" "utils"
## [31] "datasets" "methods" "base"
##
## [[6]]
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "matrixStats" "GenomicRanges" "GenomeInfoDb"
## [7] "limma" "forcats" "stringr"
## [10] "dplyr" "purrr" "readr"
## [13] "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "oligo" "Biostrings"
## [19] "XVector" "IRanges" "S4Vectors"
## [22] "stats4" "Biobase" "oligoClasses"
## [25] "BiocGenerics" "parallel" "knitr"
## [28] "stats" "graphics" "grDevices"
## [31] "utils" "datasets" "methods"
## [34] "base"
##
## [[7]]
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "matrixStats" "GenomicRanges" "GenomeInfoDb"
## [7] "limma" "forcats" "stringr"
## [10] "dplyr" "purrr" "readr"
## [13] "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "oligo" "Biostrings"
## [19] "XVector" "IRanges" "S4Vectors"
## [22] "stats4" "Biobase" "oligoClasses"
## [25] "BiocGenerics" "parallel" "knitr"
## [28] "stats" "graphics" "grDevices"
## [31] "utils" "datasets" "methods"
## [34] "base"
##
## [[8]]
## [1] "org.Hs.eg.db" "AnnotationDbi" "DESeq2"
## [4] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [7] "GenomicRanges" "GenomeInfoDb" "limma"
## [10] "forcats" "stringr" "dplyr"
## [13] "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse"
## [19] "oligo" "Biostrings" "XVector"
## [22] "IRanges" "S4Vectors" "stats4"
## [25] "Biobase" "oligoClasses" "BiocGenerics"
## [28] "parallel" "knitr" "stats"
## [31] "graphics" "grDevices" "utils"
## [34] "datasets" "methods" "base"
##
## [[9]]
## [1] "org.Hs.eg.db" "AnnotationDbi" "DESeq2"
## [4] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [7] "GenomicRanges" "GenomeInfoDb" "limma"
## [10] "forcats" "stringr" "dplyr"
## [13] "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse"
## [19] "oligo" "Biostrings" "XVector"
## [22] "IRanges" "S4Vectors" "stats4"
## [25] "Biobase" "oligoClasses" "BiocGenerics"
## [28] "parallel" "knitr" "stats"
## [31] "graphics" "grDevices" "utils"
## [34] "datasets" "methods" "base"
##
## [[10]]
## [1] "vsn" "org.Hs.eg.db" "AnnotationDbi"
## [4] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [7] "matrixStats" "GenomicRanges" "GenomeInfoDb"
## [10] "limma" "forcats" "stringr"
## [13] "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2"
## [19] "tidyverse" "oligo" "Biostrings"
## [22] "XVector" "IRanges" "S4Vectors"
## [25] "stats4" "Biobase" "oligoClasses"
## [28] "BiocGenerics" "parallel" "knitr"
## [31] "stats" "graphics" "grDevices"
## [34] "utils" "datasets" "methods"
## [37] "base"
##
## [[11]]
## [1] "recount" "vsn" "org.Hs.eg.db"
## [4] "AnnotationDbi" "DESeq2" "SummarizedExperiment"
## [7] "DelayedArray" "matrixStats" "GenomicRanges"
## [10] "GenomeInfoDb" "limma" "forcats"
## [13] "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble"
## [19] "ggplot2" "tidyverse" "oligo"
## [22] "Biostrings" "XVector" "IRanges"
## [25] "S4Vectors" "stats4" "Biobase"
## [28] "oligoClasses" "BiocGenerics" "parallel"
## [31] "knitr" "stats" "graphics"
## [34] "grDevices" "utils" "datasets"
## [37] "methods" "base"
Time for this code chunk: 18.9283258914948
We selected experiments from lung samples of COPD patients and that also have a control group to compare. These experiments are described in the following table:
gse_table <- read.csv(file.path(OUTPUT_DIR,"2020-09-03-GSE_Summary.csv"), row.names = 1)
kable(gse_table, caption = "GSE information")
| CONTROL | COPD | COUNTRY | SUBMISSION_DATE | PLATFORM | NUM_GENES | |
|---|---|---|---|---|---|---|
| GSE27597 | 16 | 48 | USA | Mar 01 2011 | GPL5175 | 7733 |
| GSE106986 | 5 | 14 | Germany | Nov 16 2017 | GPL13497 | 21690 |
| GSE37768 | 20 | 18 | Spain | May 04 2012 | GPL570 | 23521 |
| GSE57148 | 91 | 96 | South Korea | Apr 28 2014 | GPL11154 | 25288 |
| GSE47460 | 91 | 145 | USA | May 29 2013 | GPL14550 | 15181 |
| GSE38974 | 9 | 23 | USA | Jun 27 2012 | GPL4133 | 19750 |
| GSE8581 | 19 | 16 | USA | Jul 12 2007,Jul 13 2007,Jul 17 2007,Jul 20 2007,Jul 24 2007,Jul 25 2007 | GPL570 | 23521 |
| GSE1650 | 12 | 18 | USA | Aug 06 2004 | GPL96 | 13516 |
| GSE1122 | 5 | 10 | USA | Mar 09 2004 | GPL80 | 5615 |
| GSE119040 | 4 | 6 | Argentina | Aug 24 2018 | GPL96 | 13516 |
| GSE103174 | 16 | 37 | Spain | Aug 28 2017 | GPL13667 | 15021 |
| GSE124180 | 15 | 6 | USA | Dec 20 2018 | GPL16791 | 52393 |
Time for this code chunk: 0.0382189750671387
The experiment GSE57148 is a RNA-seq experiment, and we do not need to normalize the data using RMA but we will download counts from ReCount2.
We already downloaded the data using GEOquery and parsed the information using curateing_COPDinfo.Rscript. So we can read the .RDS object into our R session
geo <- readRDS(file.path(DATA_DIR,"2020-09-GSE_LungTissue-CURATED.RDS"))
names(geo)
## [1] "GSE27597" "GSE106986" "GSE37768" "GSE57148" "GSE47460" "GSE38974"
## [7] "GSE8581" "GSE1650" "GSE1122" "GSE119040" "GSE103174" "GSE124180"
Time for this code chunk: 12.8940010070801
Using this funtion, we get a table with differential expression gene results using limma package for fitting a linear model to get genes differentially expressed between a “Control” and a “COPD” group.
Input: GSE ID, optional: colCOPD is the column name in which the information of disease status can be found, coeff will show results of contrast with coeffitient found in possiton 2
Output: Table of differential expression results with all genes
DE <- function(ExpressionSet,colCOPD="DISEASE",coeff= "DISEASECOPD"){
# it creates the design matrix and performs limma
fit <- lmFit(ExpressionSet, model.matrix(as.formula(paste("~ 1 +", colCOPD)),
data = pData(ExpressionSet)))
# eBayes in lmFit model
ebf <- eBayes(fit)
print(colnames(coef(fit)))
# It gets the genes with the p-values
volcanoplot(ebf,coef = coeff,highlight=20, pch=20)
res <- topTable(ebf, number = Inf, p.value = 1, coef = coeff,confint=T)
# It formats in a tibble
res <- as_tibble(res,rownames="rownames")
}
Time for this code chunk: 0.00423002243041992
Names will be different but it is important to check that “Control” group is the first level. If need it re-level groups.
Using DE() function (described above), we performed a lineal regression model to calculate the logarithm fold change of all genes between a “Control” and a “COPD” group. We also rename colnames adding the GSE ID at the end and finally, we save the output in a .CSV file.
i <- 1
g <- geo[[i]][[2]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE + PATIENT")
## Coefficients not estimable: PATIENT6983
## [1] "(Intercept)" "DISEASECOPD" "PATIENT6967" "PATIENT6968" "PATIENT6969"
## [6] "PATIENT6970" "PATIENT6971" "PATIENT6982" "PATIENT6983"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE27597" "ID_GSE27597" "GB_LIST_GSE27597"
## [4] "GENE.SYMBOL_GSE27597" "logFC_GSE27597" "CI.L_GSE27597"
## [7] "CI.R_GSE27597" "AveExpr_GSE27597" "t_GSE27597"
## [10] "P.Value_GSE27597" "adj.P.Val_GSE27597" "B_GSE27597"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de1 <- de
Time for this code chunk: 3.49025702476501
i <- 2
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE106986" "ID_GSE106986"
## [3] "SPOT_ID_GSE106986" "CONTROL_TYPE_GSE106986"
## [5] "REFSEQ_GSE106986" "GB_ACC_GSE106986"
## [7] "GENE_GSE106986" "GENE_SYMBOL_GSE106986"
## [9] "GENE_NAME_GSE106986" "UNIGENE_ID_GSE106986"
## [11] "ENSEMBL_ID_GSE106986" "TIGR_ID_GSE106986"
## [13] "ACCESSION_STRING_GSE106986" "CHROMOSOMAL_LOCATION_GSE106986"
## [15] "CYTOBAND_GSE106986" "DESCRIPTION_GSE106986"
## [17] "GO_ID_GSE106986" "SEQUENCE_GSE106986"
## [19] "GENE.SYMBOL_GSE106986" "logFC_GSE106986"
## [21] "CI.L_GSE106986" "CI.R_GSE106986"
## [23] "AveExpr_GSE106986" "t_GSE106986"
## [25] "P.Value_GSE106986" "adj.P.Val_GSE106986"
## [27] "B_GSE106986"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de2 <- de
Time for this code chunk: 3.27422618865967
i <- 3
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE37768"
## [2] "ID_GSE37768"
## [3] "GB_ACC_GSE37768"
## [4] "SPOT_ID_GSE37768"
## [5] "Species.Scientific.Name_GSE37768"
## [6] "Annotation.Date_GSE37768"
## [7] "Sequence.Type_GSE37768"
## [8] "Sequence.Source_GSE37768"
## [9] "Target.Description_GSE37768"
## [10] "Representative.Public.ID_GSE37768"
## [11] "Gene.Title_GSE37768"
## [12] "Gene.Symbol_GSE37768"
## [13] "ENTREZ_GENE_ID_GSE37768"
## [14] "RefSeq.Transcript.ID_GSE37768"
## [15] "Gene.Ontology.Biological.Process_GSE37768"
## [16] "Gene.Ontology.Cellular.Component_GSE37768"
## [17] "Gene.Ontology.Molecular.Function_GSE37768"
## [18] "GENE.SYMBOL_GSE37768"
## [19] "logFC_GSE37768"
## [20] "CI.L_GSE37768"
## [21] "CI.R_GSE37768"
## [22] "AveExpr_GSE37768"
## [23] "t_GSE37768"
## [24] "P.Value_GSE37768"
## [25] "adj.P.Val_GSE37768"
## [26] "B_GSE37768"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de3 <- de
Time for this code chunk: 7.77078485488892
This experiment is RNAseq, the data will be download from Recount2. I’m following Recount2 vignette Using DESeq2package we calculated DEG.
## Specify design and switch to DESeq2 format
i <- 4
rse <- geo[[i]][[1]]
dds <- DESeqDataSet(rse, ~ DISEASE)
## converting counts to integer mode
## Perform DE analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1439 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
ntd <- normTransform(dds)
meanSdPlot(assay(ntd))
res <- results(dds)
plotMA(res, ylim=c(-2,2))
# Calculates de CI
res$error <- qnorm(0.975)*res$lfcSE
res$CI.L <- res$log2FoldChange-res$error
res$CI.R <- res$log2FoldChange+res$error
res
## log2 fold change (MLE): DISEASE COPD vs CONTROL
## Wald test p-value: DISEASE COPD vs CONTROL
## DataFrame with 58037 rows and 9 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 924.46223 -0.1311028 0.0763894 -1.716244 0.0861174
## ENSG00000000005.5 2.85737 -0.4396692 0.4629495 -0.949713 0.3422580
## ENSG00000000419.12 1089.25569 0.0152275 0.0300546 0.506663 0.6123911
## ENSG00000000457.13 698.75730 -0.0656469 0.0405149 -1.620314 0.1051648
## ENSG00000000460.16 347.79272 -0.0502240 0.0465180 -1.079669 0.2802896
## ... ... ... ... ... ...
## ENSG00000283695.1 0.0454355 -0.1779953 1.9047251 -0.0934493 9.25547e-01
## ENSG00000283696.1 37.5449930 0.5120456 0.0933836 5.4832482 4.17586e-08
## ENSG00000283697.1 21.2156969 0.0782234 0.1087300 0.7194274 4.71878e-01
## ENSG00000283698.1 0.1327733 -0.1294404 0.9741244 -0.1328787 8.94289e-01
## ENSG00000283699.1 0.0646975 0.1592290 1.3843041 0.1150246 9.08426e-01
## padj error CI.L CI.R
## <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 0.177940 0.1497204 -0.2808232 0.0186176
## ENSG00000000005.5 0.502138 0.9073642 -1.3470334 0.4676951
## ENSG00000000419.12 0.743547 0.0589059 -0.0436783 0.0741334
## ENSG00000000457.13 0.207758 0.0794078 -0.1450547 0.0137609
## ENSG00000000460.16 0.435330 0.0911736 -0.1413976 0.0409495
## ... ... ... ... ...
## ENSG00000283695.1 NA 3.733193 -3.911188 3.555197
## ENSG00000283696.1 5.80204e-07 0.183029 0.329017 0.695074
## ENSG00000283697.1 6.26366e-01 0.213107 -0.134884 0.291330
## ENSG00000283698.1 NA 1.909249 -2.038689 1.779808
## ENSG00000283699.1 NA 2.713186 -2.553957 2.872415
## Extract Gencode gene ids
gencode <- gsub('\\..*', '', names(recount_genes))
## Find the gene information we are interested in
gene_info <- AnnotationDbi::select(org.Hs.eg.db, gencode, c('SYMBOL', 'ENSEMBL'), 'ENSEMBL')
## 'select()' returned many:many mapping between keys and columns
r <- as_tibble(res, rownames="rownames")
r$rownames <- gsub("\\..*","",r$rownames)
r <- full_join(r,gene_info, by=c("rownames"="ENSEMBL"))
r$GENE.SYMBOL <- r$SYMBOL
r$logFC <- r$log2FoldChange
colnames(r) <- str_c(colnames(r),"_",names(geo[i]))
colnames(r)
## [1] "rownames_GSE57148" "baseMean_GSE57148"
## [3] "log2FoldChange_GSE57148" "lfcSE_GSE57148"
## [5] "stat_GSE57148" "pvalue_GSE57148"
## [7] "padj_GSE57148" "error_GSE57148"
## [9] "CI.L_GSE57148" "CI.R_GSE57148"
## [11] "SYMBOL_GSE57148" "GENE.SYMBOL_GSE57148"
## [13] "logFC_GSE57148"
write_csv(r,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv"))
de4 <- r
Time for this code chunk: 3.2814636349678
i <- 5
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
## [3] "DISEASEInterstitial lung disease"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE47460" "ID_GSE47460"
## [3] "SPOT_ID_GSE47460" "CONTROL_TYPE_GSE47460"
## [5] "REFSEQ_GSE47460" "GB_ACC_GSE47460"
## [7] "GENE_GSE47460" "GENE_SYMBOL_GSE47460"
## [9] "GENE_NAME_GSE47460" "UNIGENE_ID_GSE47460"
## [11] "ENSEMBL_ID_GSE47460" "TIGR_ID_GSE47460"
## [13] "ACCESSION_STRING_GSE47460" "CHROMOSOMAL_LOCATION_GSE47460"
## [15] "CYTOBAND_GSE47460" "DESCRIPTION_GSE47460"
## [17] "GO_ID_GSE47460" "SEQUENCE_GSE47460"
## [19] "GENE.SYMBOL_GSE47460" "logFC_GSE47460"
## [21] "CI.L_GSE47460" "CI.R_GSE47460"
## [23] "AveExpr_GSE47460" "t_GSE47460"
## [25] "P.Value_GSE47460" "adj.P.Val_GSE47460"
## [27] "B_GSE47460"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de5 <- de
Time for this code chunk: 6.50487589836121
i <- 6
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE38974" "ID_GSE38974"
## [3] "COL_GSE38974" "ROW_GSE38974"
## [5] "NAME_GSE38974" "SPOT_ID_GSE38974"
## [7] "CONTROL_TYPE_GSE38974" "REFSEQ_GSE38974"
## [9] "GB_ACC_GSE38974" "GENE_GSE38974"
## [11] "GENE_SYMBOL_GSE38974" "GENE_NAME_GSE38974"
## [13] "UNIGENE_ID_GSE38974" "ENSEMBL_ID_GSE38974"
## [15] "TIGR_ID_GSE38974" "ACCESSION_STRING_GSE38974"
## [17] "CHROMOSOMAL_LOCATION_GSE38974" "CYTOBAND_GSE38974"
## [19] "DESCRIPTION_GSE38974" "GO_ID_GSE38974"
## [21] "SEQUENCE_GSE38974" "SPOT_ID_1_GSE38974"
## [23] "ORDER_GSE38974" "GENE.SYMBOL_GSE38974"
## [25] "logFC_GSE38974" "CI.L_GSE38974"
## [27] "CI.R_GSE38974" "AveExpr_GSE38974"
## [29] "t_GSE38974" "P.Value_GSE38974"
## [31] "adj.P.Val_GSE38974" "B_GSE38974"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de6 <- de
Time for this code chunk: 4.37785387039185
i <- 7
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD" "DISEASEUnclassified"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE8581"
## [2] "ID_GSE8581"
## [3] "GB_ACC_GSE8581"
## [4] "SPOT_ID_GSE8581"
## [5] "Species.Scientific.Name_GSE8581"
## [6] "Annotation.Date_GSE8581"
## [7] "Sequence.Type_GSE8581"
## [8] "Sequence.Source_GSE8581"
## [9] "Target.Description_GSE8581"
## [10] "Representative.Public.ID_GSE8581"
## [11] "Gene.Title_GSE8581"
## [12] "Gene.Symbol_GSE8581"
## [13] "ENTREZ_GENE_ID_GSE8581"
## [14] "RefSeq.Transcript.ID_GSE8581"
## [15] "Gene.Ontology.Biological.Process_GSE8581"
## [16] "Gene.Ontology.Cellular.Component_GSE8581"
## [17] "Gene.Ontology.Molecular.Function_GSE8581"
## [18] "GENE.SYMBOL_GSE8581"
## [19] "logFC_GSE8581"
## [20] "CI.L_GSE8581"
## [21] "CI.R_GSE8581"
## [22] "AveExpr_GSE8581"
## [23] "t_GSE8581"
## [24] "P.Value_GSE8581"
## [25] "adj.P.Val_GSE8581"
## [26] "B_GSE8581"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de7 <- de
Time for this code chunk: 7.19492101669312
i <- 8
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE1650"
## [2] "ID_GSE1650"
## [3] "GB_ACC_GSE1650"
## [4] "SPOT_ID_GSE1650"
## [5] "Species.Scientific.Name_GSE1650"
## [6] "Annotation.Date_GSE1650"
## [7] "Sequence.Type_GSE1650"
## [8] "Sequence.Source_GSE1650"
## [9] "Target.Description_GSE1650"
## [10] "Representative.Public.ID_GSE1650"
## [11] "Gene.Title_GSE1650"
## [12] "Gene.Symbol_GSE1650"
## [13] "ENTREZ_GENE_ID_GSE1650"
## [14] "RefSeq.Transcript.ID_GSE1650"
## [15] "Gene.Ontology.Biological.Process_GSE1650"
## [16] "Gene.Ontology.Cellular.Component_GSE1650"
## [17] "Gene.Ontology.Molecular.Function_GSE1650"
## [18] "GENE.SYMBOL_GSE1650"
## [19] "logFC_GSE1650"
## [20] "CI.L_GSE1650"
## [21] "CI.R_GSE1650"
## [22] "AveExpr_GSE1650"
## [23] "t_GSE1650"
## [24] "P.Value_GSE1650"
## [25] "adj.P.Val_GSE1650"
## [26] "B_GSE1650"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de8 <- de
Time for this code chunk: 4.06880402565002
i <- 9
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE1122"
## [2] "ID_GSE1122"
## [3] "GB_ACC_GSE1122"
## [4] "SPOT_ID_GSE1122"
## [5] "Species.Scientific.Name_GSE1122"
## [6] "Annotation.Date_GSE1122"
## [7] "Sequence.Type_GSE1122"
## [8] "Sequence.Source_GSE1122"
## [9] "Target.Description_GSE1122"
## [10] "Representative.Public.ID_GSE1122"
## [11] "Gene.Title_GSE1122"
## [12] "Gene.Symbol_GSE1122"
## [13] "ENTREZ_GENE_ID_GSE1122"
## [14] "RefSeq.Transcript.ID_GSE1122"
## [15] "Gene.Ontology.Biological.Process_GSE1122"
## [16] "Gene.Ontology.Cellular.Component_GSE1122"
## [17] "Gene.Ontology.Molecular.Function_GSE1122"
## [18] "GENE.SYMBOL_GSE1122"
## [19] "logFC_GSE1122"
## [20] "CI.L_GSE1122"
## [21] "CI.R_GSE1122"
## [22] "AveExpr_GSE1122"
## [23] "t_GSE1122"
## [24] "P.Value_GSE1122"
## [25] "adj.P.Val_GSE1122"
## [26] "B_GSE1122"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de9 <- de
Time for this code chunk: 1.65178990364075
i <- 10
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE119040"
## [2] "ID_GSE119040"
## [3] "GB_ACC_GSE119040"
## [4] "SPOT_ID_GSE119040"
## [5] "Species.Scientific.Name_GSE119040"
## [6] "Annotation.Date_GSE119040"
## [7] "Sequence.Type_GSE119040"
## [8] "Sequence.Source_GSE119040"
## [9] "Target.Description_GSE119040"
## [10] "Representative.Public.ID_GSE119040"
## [11] "Gene.Title_GSE119040"
## [12] "Gene.Symbol_GSE119040"
## [13] "ENTREZ_GENE_ID_GSE119040"
## [14] "RefSeq.Transcript.ID_GSE119040"
## [15] "Gene.Ontology.Biological.Process_GSE119040"
## [16] "Gene.Ontology.Cellular.Component_GSE119040"
## [17] "Gene.Ontology.Molecular.Function_GSE119040"
## [18] "GENE.SYMBOL_GSE119040"
## [19] "logFC_GSE119040"
## [20] "CI.L_GSE119040"
## [21] "CI.R_GSE119040"
## [22] "AveExpr_GSE119040"
## [23] "t_GSE119040"
## [24] "P.Value_GSE119040"
## [25] "adj.P.Val_GSE119040"
## [26] "B_GSE119040"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de10 <- de
Time for this code chunk: 3.52170705795288
i <- 11
g <- geo[[i]][[1]]
boxplot(g)
hist(g)
de <- DE(g,colCOPD = "DISEASE")
## [1] "(Intercept)" "DISEASECOPD"
colnames(de) <- str_c(colnames(de),"_",names(geo)[i])
colnames(de)
## [1] "rownames_GSE103174"
## [2] "ID_GSE103174"
## [3] "GeneChip.Array_GSE103174"
## [4] "Species.Scientific.Name_GSE103174"
## [5] "Annotation.Date_GSE103174"
## [6] "Sequence.Type_GSE103174"
## [7] "Sequence.Source_GSE103174"
## [8] "Transcript.ID.Array.Design._GSE103174"
## [9] "Target.Description_GSE103174"
## [10] "Representative.Public.ID_GSE103174"
## [11] "Archival.UniGene.Cluster_GSE103174"
## [12] "UniGene.ID_GSE103174"
## [13] "Genome.Version_GSE103174"
## [14] "Alignments_GSE103174"
## [15] "Gene.Title_GSE103174"
## [16] "Gene.Symbol_GSE103174"
## [17] "Chromosomal.Location_GSE103174"
## [18] "GB_LIST_GSE103174"
## [19] "SPOT_ID_GSE103174"
## [20] "Unigene.Cluster.Type_GSE103174"
## [21] "Ensembl_GSE103174"
## [22] "Entrez.Gene_GSE103174"
## [23] "SwissProt_GSE103174"
## [24] "EC_GSE103174"
## [25] "OMIM_GSE103174"
## [26] "RefSeq.Protein.ID_GSE103174"
## [27] "RefSeq.Transcript.ID_GSE103174"
## [28] "FlyBase_GSE103174"
## [29] "AGI_GSE103174"
## [30] "WormBase_GSE103174"
## [31] "MGI.Name_GSE103174"
## [32] "RGD.Name_GSE103174"
## [33] "SGD.accession.number_GSE103174"
## [34] "Gene.Ontology.Biological.Process_GSE103174"
## [35] "Gene.Ontology.Cellular.Component_GSE103174"
## [36] "Gene.Ontology.Molecular.Function_GSE103174"
## [37] "Pathway_GSE103174"
## [38] "InterPro_GSE103174"
## [39] "Trans.Membrane_GSE103174"
## [40] "QTL_GSE103174"
## [41] "Annotation.Description_GSE103174"
## [42] "Annotation.Transcript.Cluster_GSE103174"
## [43] "Transcript.Assignments_GSE103174"
## [44] "Annotation.Notes_GSE103174"
## [45] "GENE.SYMBOL_GSE103174"
## [46] "logFC_GSE103174"
## [47] "CI.L_GSE103174"
## [48] "CI.R_GSE103174"
## [49] "AveExpr_GSE103174"
## [50] "t_GSE103174"
## [51] "P.Value_GSE103174"
## [52] "adj.P.Val_GSE103174"
## [53] "B_GSE103174"
write_csv(de,
path=str_c(OUTPUT_DIR,"/DE/",TODAY,"_TableGenes_",names(geo[i]),".csv")
)
de11 <- de
Time for this code chunk: 4.22814893722534
This script produces the following data, and can be found in /Users/user/Documents/Doctorado/Analysis/Meta-analysis_COPD
Tables with DE results: Tables with log fold change and p-values calculated
Table of merged results: Table with all DE results
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] recount_1.14.0 vsn_3.56.0
## [3] org.Hs.eg.db_3.11.4 AnnotationDbi_1.50.3
## [5] DESeq2_1.28.1 SummarizedExperiment_1.18.2
## [7] DelayedArray_0.14.1 matrixStats_0.56.0
## [9] GenomicRanges_1.40.0 GenomeInfoDb_1.24.2
## [11] limma_3.44.3 forcats_0.5.0
## [13] stringr_1.4.0 dplyr_1.0.2.9000
## [15] purrr_0.3.4 readr_1.3.1
## [17] tidyr_1.1.2 tibble_3.0.3
## [19] ggplot2_3.3.2 tidyverse_1.3.0
## [21] oligo_1.52.1 Biostrings_2.56.0
## [23] XVector_0.28.0 IRanges_2.22.2
## [25] S4Vectors_0.26.1 Biobase_2.48.0
## [27] oligoClasses_1.50.4 BiocGenerics_0.34.0
## [29] knitr_1.29
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.9 Hmisc_4.4-1
## [4] BiocFileCache_1.12.1 plyr_1.8.6 splines_4.0.2
## [7] BiocParallel_1.22.0 digest_0.6.25 foreach_1.5.0
## [10] htmltools_0.5.0 fansi_0.4.1 checkmate_2.0.0
## [13] magrittr_1.5 memoise_1.1.0 BSgenome_1.56.0
## [16] cluster_2.1.0 annotate_1.66.0 modelr_0.1.8
## [19] askpass_1.1 prettyunits_1.1.1 jpeg_0.1-8.1
## [22] colorspace_1.4-1 blob_1.2.1 rvest_0.3.6
## [25] rappdirs_0.3.1 haven_2.3.1 xfun_0.16
## [28] hexbin_1.28.1 crayon_1.3.4 RCurl_1.98-1.2
## [31] jsonlite_1.7.0 genefilter_1.70.0 GEOquery_2.56.0
## [34] survival_3.2-3 VariantAnnotation_1.34.0 iterators_1.0.12
## [37] glue_1.4.2 gtable_0.3.0 zlibbioc_1.34.0
## [40] rentrez_1.2.2 scales_1.1.1 rngtools_1.5
## [43] DBI_1.1.0 derfinderHelper_1.22.0 derfinder_1.22.0
## [46] Rcpp_1.0.5 htmlTable_2.0.1 xtable_1.8-4
## [49] progress_1.2.2 bumphunter_1.30.0 foreign_0.8-80
## [52] bit_4.0.4 preprocessCore_1.50.0 Formula_1.2-3
## [55] htmlwidgets_1.5.1 httr_1.4.2 RColorBrewer_1.1-2
## [58] ellipsis_0.3.1 ff_4.0.2 farver_2.0.3
## [61] pkgconfig_2.0.3 XML_3.99-0.5 nnet_7.3-14
## [64] dbplyr_1.4.4 locfit_1.5-9.4 here_0.1
## [67] labeling_0.3 reshape2_1.4.4 tidyselect_1.1.0
## [70] rlang_0.4.7 munsell_0.5.0 cellranger_1.1.0
## [73] tools_4.0.2 downloader_0.4 cli_2.0.2
## [76] generics_0.0.2 RSQLite_2.2.0 broom_0.7.0
## [79] evaluate_0.14 yaml_2.2.1 bit64_4.0.5
## [82] fs_1.5.0 doRNG_1.8.2 xml2_1.3.2
## [85] biomaRt_2.44.1 BiocStyle_2.16.0 compiler_4.0.2
## [88] rstudioapi_0.11 curl_4.3 png_0.1-7
## [91] affyio_1.58.0 reprex_0.3.0 geneplotter_1.66.0
## [94] stringi_1.4.6 highr_0.8 GenomicFeatures_1.40.1
## [97] GenomicFiles_1.24.0 lattice_0.20-41 Matrix_1.2-18
## [100] vctrs_0.3.4 pillar_1.4.6 lifecycle_0.2.0
## [103] BiocManager_1.30.10 data.table_1.13.0 bitops_1.0-6
## [106] qvalue_2.20.0 rtracklayer_1.48.0 R6_2.4.1
## [109] latticeExtra_0.6-29 affy_1.66.0 gridExtra_2.3
## [112] affxparser_1.60.0 codetools_0.2-16 assertthat_0.2.1
## [115] openssl_1.4.2 rprojroot_1.3-2 withr_2.2.0
## [118] GenomicAlignments_1.24.0 Rsamtools_2.4.0 GenomeInfoDbData_1.2.3
## [121] hms_0.5.3 grid_4.0.2 rpart_4.1-15
## [124] rmarkdown_2.3 lubridate_1.7.9 base64enc_0.1-3
Time for this code chunk: 0.328958034515381